{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['malay', 'kedah', 'johor', 'melaka', 'terengganu', 'sarawak', 'negeri-sembilan', 'kelantan', 'perak', 'pahang', 'sabah'])"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with open('sublanguages.json') as fopen:\n",
    "    lang = json.load(fopen)\n",
    "    \n",
    "lang.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "replace = {'kedah': 'utara'}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, Y = [], []\n",
    "\n",
    "for k in lang.keys():\n",
    "    X.extend(lang[k])\n",
    "    Y.extend([replace.get(k,k)] * len(lang[k]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "assert len(X) == len(Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 177777/177777 [00:00<00:00, 1339361.70it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "actual_X, actual_Y = [], []\n",
    "\n",
    "for i in tqdm(range(len(X))):\n",
    "    if len(X[i]):\n",
    "        actual_X.append(X[i])\n",
    "        actual_Y.append(Y[i])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(177777, 177777)"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(actual_X), len(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "import numpy as np\n",
    "\n",
    "train_X, test_X, train_Y, test_Y = train_test_split(actual_X, actual_Y, test_size = 0.2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['johor', 'kelantan', 'malay', 'melaka', 'negeri-sembilan',\n",
       "        'pahang', 'perak', 'sabah', 'sarawak', 'terengganu', 'utara'],\n",
       "       dtype='<U15'),\n",
       " array([ 2071,  1951, 79979,  8326,  6187,  2829,  1009,   991,  5972,\n",
       "         3846, 29060]))"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(train_Y, return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(array(['johor', 'kelantan', 'malay', 'melaka', 'negeri-sembilan',\n",
       "        'pahang', 'perak', 'sabah', 'sarawak', 'terengganu', 'utara'],\n",
       "       dtype='<U15'),\n",
       " array([  504,   487, 20021,  2067,  1530,   767,   287,   260,  1429,\n",
       "          993,  7211]))"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "np.unique(test_Y, return_counts=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train-test-sublanguages.json', 'w') as fopen:\n",
    "    json.dump({'train_X': train_X, 'test_X': test_X, 'train_Y': train_Y, 'test_Y': test_Y}, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import boto3\n",
    "\n",
    "bucketName = 'malaya-dataset'\n",
    "Key = 'train-test-sublanguages.json'\n",
    "outPutname = \"language-detection/train-test-sublanguages.json\"\n",
    "\n",
    "s3 = boto3.client('s3')\n",
    "s3.upload_file(Key,bucketName,outPutname)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "__label__rus Ты можешь позвонить Тому и сказать, чтобы он немедленно приезжал?\r\n",
      "__label__rus А рыльце-то у него в пушку.\r\n",
      "__label__ind Sebagaimana kita memerlukan udara yang nyaman, juga ikan memerlukan air yang bersih.\r\n",
      "__label__epo Italio havas du montoĉenojn, Alpojn kaj Apeninojn.\r\n",
      "__label__rus Этот ботинок дерьмовый, потому что он из Китая.\r\n",
      "__label__tur Sami bilgisayarların nasıl hackleneceğini bilmiyor.\r\n",
      "__label__rus Президент остался в постели.\r\n",
      "__label__eng I can't remember anything else about Tom.\r\n",
      "__label__eng It's what I wanted.\r\n",
      "__label__ber Sami yessen yelli-s n yiwen n yimsujji.\r\n"
     ]
    }
   ],
   "source": [
    "!head valid.txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 142221/142221 [00:00<00:00, 1050133.99it/s]\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "texts = []\n",
    "\n",
    "for i in tqdm(range(len(train_X))):\n",
    "    texts.append('__label__%s %s'%(train_Y[i], train_X[i]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('train-bahasa-sublanguages.txt', 'w') as fopen:\n",
    "    fopen.write('\\n'.join(texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "__label__malay peduli terhadap kebakaran hutan dan lahan\r\n",
      "__label__utara ada la experience naik ambulance drip ayaq botol ambik darah kali alhamdulillah syukur saya bebas denggi\r\n",
      "__label__melaka kl je la minggu ketiga balik perak nak amek kad nikah sekali heheheh balik bp kettuw\r\n",
      "__label__melaka kudap kudapan yg sesuai dlm diet untuk kurus epal jambu batu kiwi sepotong betik pear hijau kacang kuda roti wholemeal tapi berpada mkn jgn melantak plak heheheh rt amp follow saya untuk tips kurus seterusnya\r\n",
      "__label__malay sasha beras\r\n",
      "__label__melaka lacak kiriman shopee express tuh pake sicepat apa gimana deh\r\n",
      "__label__sarawak packing mcm sik balit kuching agik\r\n",
      "__label__malay gapapa aku kuat lihat ya inimalam minggu\r\n",
      "__label__malay kotak tuh kalau gk salah pas tahun an berarti festa ke\r\n",
      "__label__sarawak pengedar narkoba intai generasi muda sekda siak himbau orangtua pantau kegiatan anak goriau\r\n"
     ]
    }
   ],
   "source": [
    "!head train-bahasa-sublanguages.txt"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
